library(data.table)
library(rmarkdown)
library(AnnotationHub)
library(tidyverse)
library(tximport)
library(ggplot2)
library(DESeq2)
library(pheatmap)
library(gridExtra)
library(UpSetR)
library(ensembldb)
library(apeglm)
library(ashr)
DB <- "EnsDb" # Set your DB of interest
AnnotationSpecies <- "Homo sapiens" # Set your species
ah <- AnnotationHub(hub=getAnnotationHubOption("URL")) # Bring annotation DB
# Filter annotation of interest
ahQuery <- query(ah,
pattern=c(DB, AnnotationSpecies),
ignore.case=T)
# Select the most recent data
DBName <- mcols(ahQuery) %>%
rownames() %>%
tail(1)
AnnoDb <- ah[[DBName]]
# Explore your EnsDb object with following accessors:
# columns(AnnpDb)
# keytypes(AnnoDb)
# keys(AnnoDb, keytype=..)
# select(AnnoDb, keys=.., columns=.., keytype=...)
AnnoKey <- keys(AnnoDb, keytype="TXID")
# Note: Annotation has to be done with not genome but transcripts
AnnoDb <- select(AnnoDb,
AnnoKey,
keytype="TXID",
columns=c("GENEID", "GENENAME"))
# Check if your AnnoDb has been extracted and saved correctely
class(AnnoDb)
## [1] "data.frame"
head(AnnoDb) # The column 1 has to assign transcript (e.g. ENSEMBLTRANS)
## TXID GENEID GENENAME
## 1 ENST00000000233 ENSG00000004059 ARF5
## 2 ENST00000000412 ENSG00000003056 M6PR
## 3 ENST00000000442 ENSG00000173153 ESRRA
## 4 ENST00000001008 ENSG00000004478 FKBP4
## 5 ENST00000001146 ENSG00000003137 CYP26B1
## 6 ENST00000002125 ENSG00000003509 NDUFAF7
.sf files have been created from fastq data by salmon
# This code chunk needs to be written by yourself
#
# Define sample names
SampleNames <- c(paste0("Mock-rep", 1:3), paste0("SARS-CoV-2-rep", 1:3))
# Define group level
GroupLevel <- c("Mock", "Covid")
# Define contrast for DE analysis
Contrast <- c("Group", GroupLevel)
# Define a vector for comparing TPM vs Counts effect
TvC <- c("TPM", "Counts")
levels(TvC) <- TvC
# Define .sf file path
sf <- c(paste0("../salmon_output/",
SampleNames,
".salmon_quant/quant.sf"))
# Define sample groups
group <- c(rep(GroupLevel[1], 3), rep(GroupLevel[2], 3))
# Create metadata
metadata <- data.frame(Sample=factor(SampleNames, levels=SampleNames),
Group=factor(group, levels=GroupLevel),
Path=sf)
rownames(metadata) <- SampleNames
# Explore the metadata
print(metadata)
## Sample Group
## Mock-rep1 Mock-rep1 Mock
## Mock-rep2 Mock-rep2 Mock
## Mock-rep3 Mock-rep3 Mock
## SARS-CoV-2-rep1 SARS-CoV-2-rep1 Covid
## SARS-CoV-2-rep2 SARS-CoV-2-rep2 Covid
## SARS-CoV-2-rep3 SARS-CoV-2-rep3 Covid
## Path
## Mock-rep1 ../salmon_output/Mock-rep1.salmon_quant/quant.sf
## Mock-rep2 ../salmon_output/Mock-rep2.salmon_quant/quant.sf
## Mock-rep3 ../salmon_output/Mock-rep3.salmon_quant/quant.sf
## SARS-CoV-2-rep1 ../salmon_output/SARS-CoV-2-rep1.salmon_quant/quant.sf
## SARS-CoV-2-rep2 ../salmon_output/SARS-CoV-2-rep2.salmon_quant/quant.sf
## SARS-CoV-2-rep3 ../salmon_output/SARS-CoV-2-rep3.salmon_quant/quant.sf
# Assign sample names to the input (.sf) file path
names(sf) <- SampleNames
# Run tximport
# tpm vs original counts
# input sf: a factor of all .sf files' path
txi_tpm <- tximport(sf,
type="salmon",
tx2gene=AnnoDb,
countsFromAbundance="lengthScaledTPM", # Extracts TPM
ignoreTxVersion=T)
txi_counts <- tximport(sf,
type="salmon",
tx2gene=AnnoDb,
ignoreTxVersion=T)
# tpm
head(txi_tpm$counts)
## Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1
## ENSG00000000003 9517.216266 7376.0754350 7413.2792 10930.427303
## ENSG00000000005 7.046572 3.9313942 0.0000 2.004902
## ENSG00000000419 878.258176 916.2463792 728.9124 1201.916522
## ENSG00000000457 711.679321 553.5118176 561.1932 777.339592
## ENSG00000000460 465.051715 352.6855963 391.7691 731.425995
## ENSG00000000938 0.000000 0.9945102 0.0000 0.000000
## SARS-CoV-2-rep2 SARS-CoV-2-rep3
## ENSG00000000003 6030.3241392 7835.437604
## ENSG00000000005 0.9863951 4.002353
## ENSG00000000419 783.6099539 991.786871
## ENSG00000000457 459.4668351 603.497683
## ENSG00000000460 377.0007259 536.931827
## ENSG00000000938 0.0000000 0.000000
dim(txi_tpm$counts)
## [1] 60217 6
# counts
head(txi_counts$counts)
## Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000000003 8991.942 7660.942 7504.000 11037.869 6193.989
## ENSG00000000005 7.000 4.000 0.000 2.000 1.000
## ENSG00000000419 889.000 928.001 730.000 1211.001 777.000
## ENSG00000000457 699.297 564.374 566.745 832.783 437.987
## ENSG00000000460 452.703 366.157 397.262 740.218 388.013
## ENSG00000000938 0.000 1.000 0.000 0.000 0.000
## SARS-CoV-2-rep3
## ENSG00000000003 7772.929
## ENSG00000000005 4.000
## ENSG00000000419 991.000
## ENSG00000000457 596.733
## ENSG00000000460 514.268
## ENSG00000000938 0.000
dim(txi_counts$counts)
## [1] 60217 6
# Set a function creating dds and vsd
dds_vsd_fn <- function(txi) {
# Create a DESeq object (so-calledd dds)
des <- DESeqDataSetFromTximport(txi,
colData=metadata,
design=~Group)
# Create a vsd object (so-called vsd)
ves <- vst(des, blind=T)
# Output them as a list
return(list(dds=des, vsd=ves))
}
TPM <- dds_vsd_fn(txi_tpm)
Counts <- dds_vsd_fn(txi_counts)
# Outputs
# dds: TPM/Counts[[1]] or TPM/Counts[['dds']]
# vsd: TPM/Counts[[2]] or TPM/Counts[['vsd']]
# TPM
TPM[[1]]
## class: DESeqDataSet
## dim: 60217 6
## metadata(1): version
## assays(1): counts
## rownames(60217): ENSG00000000003 ENSG00000000005 ... ENSG00000288683
## ENSG00000288696
## rowData names(0):
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
head(counts(TPM[[1]]))
## Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000000003 9517 7376 7413 10930 6030
## ENSG00000000005 7 4 0 2 1
## ENSG00000000419 878 916 729 1202 784
## ENSG00000000457 712 554 561 777 459
## ENSG00000000460 465 353 392 731 377
## ENSG00000000938 0 1 0 0 0
## SARS-CoV-2-rep3
## ENSG00000000003 7835
## ENSG00000000005 4
## ENSG00000000419 992
## ENSG00000000457 603
## ENSG00000000460 537
## ENSG00000000938 0
# Counts
Counts[[1]]
## class: DESeqDataSet
## dim: 60217 6
## metadata(1): version
## assays(2): counts avgTxLength
## rownames(60217): ENSG00000000003 ENSG00000000005 ... ENSG00000288683
## ENSG00000288696
## rowData names(0):
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
head(counts(Counts[[1]]))
## Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000000003 8992 7661 7504 11038 6194
## ENSG00000000005 7 4 0 2 1
## ENSG00000000419 889 928 730 1211 777
## ENSG00000000457 699 564 567 833 438
## ENSG00000000460 453 366 397 740 388
## ENSG00000000938 0 1 0 0 0
## SARS-CoV-2-rep3
## ENSG00000000003 7773
## ENSG00000000005 4
## ENSG00000000419 991
## ENSG00000000457 597
## ENSG00000000460 514
## ENSG00000000938 0
# TPM
TPM[[2]]
## class: DESeqTransform
## dim: 60217 6
## metadata(1): version
## assays(1): ''
## rownames(60217): ENSG00000000003 ENSG00000000005 ... ENSG00000288683
## ENSG00000288696
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(4): Sample Group Path sizeFactor
# Counts
Counts[[2]]
## class: DESeqTransform
## dim: 60217 6
## metadata(1): version
## assays(1): ''
## rownames(60217): ENSG00000000003 ENSG00000000005 ... ENSG00000288683
## ENSG00000288696
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
# Set a function estimating size factors, dispersions, and perform wald test
DESeqPrep_fn <- function(List) {
List[[1]] <- estimateSizeFactors(List[[1]])
List[[1]] <- estimateDispersions(List[[1]])
List[[1]] <- nbinomWaldTest(List[[1]])
return(List)
}
# Update dds with the function
Counts <- DESeqPrep_fn(Counts)
TPM <- DESeqPrep_fn(TPM)
sizeFactors(Counts[[1]])
## NULL
sizeFactors(TPM[[1]])
## Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## 1.0962032 1.0446805 0.8945333 1.3649221 0.7478378
## SARS-CoV-2-rep3
## 1.0187286
# Size factors don't exist in the Counts dds!
# Normalization factors are calculated in the Counts dds instead!
assays(Counts[[1]])
## List of length 6
## names(6): counts avgTxLength normalizationFactors mu H cooks
assays(TPM[[1]])
## List of length 4
## names(4): counts mu H cooks
colData(Counts[[1]])
## DataFrame with 6 rows and 3 columns
## Sample Group Path
## <factor> <factor> <character>
## Mock-rep1 Mock-rep1 Mock ../salmon_output/Moc..
## Mock-rep2 Mock-rep2 Mock ../salmon_output/Moc..
## Mock-rep3 Mock-rep3 Mock ../salmon_output/Moc..
## SARS-CoV-2-rep1 SARS-CoV-2-rep1 Covid ../salmon_output/SAR..
## SARS-CoV-2-rep2 SARS-CoV-2-rep2 Covid ../salmon_output/SAR..
## SARS-CoV-2-rep3 SARS-CoV-2-rep3 Covid ../salmon_output/SAR..
colData(TPM[[1]])
## DataFrame with 6 rows and 4 columns
## Sample Group Path sizeFactor
## <factor> <factor> <character> <numeric>
## Mock-rep1 Mock-rep1 Mock ../salmon_output/Moc.. 1.096203
## Mock-rep2 Mock-rep2 Mock ../salmon_output/Moc.. 1.044680
## Mock-rep3 Mock-rep3 Mock ../salmon_output/Moc.. 0.894533
## SARS-CoV-2-rep1 SARS-CoV-2-rep1 Covid ../salmon_output/SAR.. 1.364922
## SARS-CoV-2-rep2 SARS-CoV-2-rep2 Covid ../salmon_output/SAR.. 0.747838
## SARS-CoV-2-rep3 SARS-CoV-2-rep3 Covid ../salmon_output/SAR.. 1.018729
# Extract and save the size factors in a data frame
sizeFactor <- as.data.frame(round(sizeFactors(TPM[[1]]), 3))
colnames(sizeFactor) <- 'Size_Factor'
sizeFactor <- sizeFactor %>%
rownames_to_column(var="Sample") %>%
inner_join(metadata[, 1:ncol(metadata)-1], by="Sample")
sizeFactor$Sample <- factor(sizeFactor$Sample, levels=SampleNames)
# Create a plot comparing the size factors by sample
ggplot(sizeFactor, aes(x=Sample,
y=Size_Factor,
fill=Group,
label=Size_Factor)) +
geom_bar(stat="identity", width=0.8) +
theme_bw() +
ggtitle("Size Factors in TPM-DESeq") +
geom_text(vjust=1.5) +
theme(axis.text.x=element_text(angle=45,
vjust=0.5)) + ylab("Size Factor") + geom_hline(yintercept=1, color="blue", linetype="dashed")
# Extract and normalization factors in a data frame
normf <- as.data.frame(normalizationFactors(Counts[[1]])) %>%
gather(Sample, Normalization_Factor) %>%
inner_join(metadata[, 1:2], by="Sample")
normf$Sample <- factor(normf$Sample, levels=SampleNames)
normf$Group <- factor(normf$Group, levels=GroupLevel)
# Create a scatter plot showing distribution of normalization factors
normFactors_plot <- ggplot(normf,
aes(x=Sample, y=Normalization_Factor)) +
geom_jitter(alpha=0.5, aes(color=Group)) +
# Add a boxplot to provide statistics in each sample
geom_boxplot(aes(x=Sample, y=Normalization_Factor),
outlier.shape=NA, alpha=0.5) +
theme_bw() +
ggtitle("Normalization Factors in Counts-DESeq") +
theme(axis.text.x=element_text(angle=45,
vjust=0.5)) +
ylab("Normalization Factor / Gene") +
# Add a dashed horizontal line to indicate where normalization factor=1
geom_hline(yintercept=1,
color="blue",
linetype="dashed")
# Print the normalization factor scatter plot
print(normFactors_plot)
# Print the same plot with larger y-magnification in order to observe the box plot
normFactors_plot +
ylim(0.5, 1.5)
# Assigne what to compare
GroupOfInterest <- Contrast[1]
# Set a function for a PCA plot
QCPCA_fn <- function(inputList, Title) {
plotPCA(inputList[[2]], # takes vsd
intgroup=GroupOfInterest) + theme_bw() + ggtitle(Title)
}
# Set heatmap annotation
ColOfInterest <- !colnames(metadata) %in% c("Sample", "Path")
HeatmapAnno <- as.data.frame(metadata[, ColOfInterest])
rownames(HeatmapAnno) <- SampleNames
colnames(HeatmapAnno) <- colnames(metadata)[ColOfInterest]
# Set a function for a correlation heatmap
QCcorrHeatmap_fn <- function(inputList, Title) {
# Extract transformed count matrix
mtx <- assay(inputList[[2]]) # takes vsd
# Calculate correlation and store in the matrix
mtx <- cor(mtx)
# Create a correlation heatmap
return(pheatmap(mtx,
annotation=HeatmapAnno,
main=paste("Sample Correlation Heatmap:",
Title)))
}
grid.arrange(QCPCA_fn(TPM, "QC PCA: TPM"),
QCPCA_fn(Counts, "QC PCA: Counts"),
nrow=2)
# TPM
QCcorrHeatmap_fn(TPM, "TPM")
# Counts
QCcorrHeatmap_fn(Counts, "Counts")
# Create a list consisted with dds objects from TPM and Counts
desList <- list(TPM[[1]], Counts[[1]])
names(desList) <- TvC
# Create a list for TPM and Counts dds
ddsList <- desList # DE without shrinkage
normal.ddsList <- desList # DE with "normal" shrinkage
ape.ddsList <- desList # DE with "apeglm" shrinkage
ash.ddsList <- desList # DE with "ashr" shrinkage
for (x in TvC) {
# Run DESeq()
ddsList[[x]] <- DESeq(desList[[x]])
print(resultsNames(ddsList[[x]]))
normal.ddsList[[x]] <- lfcShrink(ddsList[[x]],
contrast=Contrast,
type="normal")
ape.ddsList[[x]] <- lfcShrink(ddsList[[x]],
coef=resultsNames(ddsList[[x]])[2],
type="apeglm")
ash.ddsList[[x]] <- lfcShrink(ddsList[[x]],
coef=resultsNames(ddsList[[x]])[2],
type="ashr")
}
## [1] "Intercept" "Group_Covid_vs_Mock"
## [1] "Intercept" "Group_Covid_vs_Mock"
# Combine every ddsList into a list
all.ddsList <- list(ddsList, normal.ddsList, ape.ddsList, ash.ddsList)
shrinkage <- c("None", "Normal", "Apeglm", "Ashr")
names(all.ddsList) <- shrinkage
# Plot dispersion
for (x in TvC) {
plotDispEsts(ddsList[[x]],
ylab="Dispersion",
xlab="Mean of Normalized Counts",
main=paste("Dispersion of", x, "Input"))
}
# Set FDR threshold
alpha=0.1
# FDR threshold vector
FDRv=c("< 0.1", "> 0.1")
# Initialize lists of result tables
all.resList <- all.ddsList
# Set a function cleaning table
Sig.fn <- function(df, Input) {
df <- df %>%
rownames_to_column(var="Gene") %>%
mutate(FDR=ifelse(padj < 0.1 & !is.na(padj),
FDRv[1],
FDRv[2]),
Input=Input)
return(df)
}
for (i in shrinkage) {
if (i == "None") {
for (x in TvC) {
# Extract data frames from unshrunken lfc & clean data
all.resList[[i]][[x]] <- as.data.frame(results(all.ddsList[[i]][[x]],
contrast=Contrast,
alpha=alpha)) %>%
Sig.fn(x)
}
} else {
# Extract data frames from shrunken lfc & clean data
for (x in TvC) {
all.resList[[i]][[x]] <- as.data.frame(all.ddsList[[i]][[x]]) %>%
Sig.fn(x)
}
}
}
# Explore the results
summary(all.resList)
## Length Class Mode
## None 2 -none- list
## Normal 2 -none- list
## Apeglm 2 -none- list
## Ashr 2 -none- list
head(all.resList[[1]][['TPM']])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSG00000000003 7965.2177921 0.01615004 0.1264025 0.1277667 0.89833358
## 2 ENSG00000000005 2.8239226 0.61324799 1.5975442 0.3838692 0.70107541
## 3 ENSG00000000419 899.2457234 -0.21790393 0.1438633 -1.5146597 0.12985862
## 4 ENSG00000000457 596.9849855 0.02668507 0.1574980 0.1694312 0.86545751
## 5 ENSG00000000460 461.1867396 -0.38591640 0.1724174 -2.2382680 0.02520359
## 6 ENSG00000000938 0.1595384 0.96899351 4.0804729 0.2374709 0.81229150
## padj FDR Input
## 1 0.9647431 > 0.1 TPM
## 2 NA > 0.1 TPM
## 3 0.3739816 > 0.1 TPM
## 4 0.9523724 > 0.1 TPM
## 5 0.1284334 > 0.1 TPM
## 6 NA > 0.1 TPM
head(all.resList[[1]][['Counts']])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSG00000000003 8079.086540 0.01545565 0.1269911 0.1217065 0.90313144
## 2 ENSG00000000005 2.859546 0.61242873 1.5819550 0.3871341 0.69865693
## 3 ENSG00000000419 912.475607 -0.21780383 0.1444052 -1.5082829 0.13148215
## 4 ENSG00000000457 605.612525 0.02396882 0.1577268 0.1519642 0.87921520
## 5 ENSG00000000460 467.640244 -0.38737335 0.1721631 -2.2500366 0.02444662
## 6 ENSG00000000938 0.161090 0.96756753 4.0804729 0.2371214 0.81256259
## padj FDR Input
## 1 0.9671255 > 0.1 Counts
## 2 NA > 0.1 Counts
## 3 0.3778008 > 0.1 Counts
## 4 0.9576855 > 0.1 Counts
## 5 0.1257650 > 0.1 Counts
## 6 NA > 0.1 Counts
head(all.resList[[2]][['TPM']])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSG00000000003 7965.2177921 0.01575077 0.1232768 0.1277667 0.89833358
## 2 ENSG00000000005 2.8239226 0.12156188 0.3170565 0.3838692 0.70107541
## 3 ENSG00000000419 899.2457234 -0.21097097 0.1392877 -1.5146597 0.12985862
## 4 ENSG00000000457 596.9849855 0.02567417 0.1515329 0.1694312 0.86545751
## 5 ENSG00000000460 461.1867396 -0.36853131 0.1646435 -2.2382680 0.02520359
## 6 ENSG00000000938 0.1595384 0.03533431 0.1487968 0.2374709 0.81229150
## padj FDR Input
## 1 0.9647431 > 0.1 TPM
## 2 NA > 0.1 TPM
## 3 0.3739816 > 0.1 TPM
## 4 0.9523724 > 0.1 TPM
## 5 0.1284334 > 0.1 TPM
## 6 NA > 0.1 TPM
head(all.resList[[2]][['Counts']])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1 ENSG00000000003 8079.086540 0.01506713 0.1237982 0.1217065 0.90313144
## 2 ENSG00000000005 2.859546 0.12255440 0.3169599 0.3871341 0.69865693
## 3 ENSG00000000419 912.475607 -0.21077061 0.1397439 -1.5082829 0.13148215
## 4 ENSG00000000457 605.612525 0.02305135 0.1516914 0.1519642 0.87921520
## 5 ENSG00000000460 467.640244 -0.36984015 0.1643650 -2.2500366 0.02444662
## 6 ENSG00000000938 0.161090 0.03501887 0.1476858 0.2371214 0.81256259
## padj FDR Input
## 1 0.9671255 > 0.1 Counts
## 2 NA > 0.1 Counts
## 3 0.3778008 > 0.1 Counts
## 4 0.9576855 > 0.1 Counts
## 5 0.1257650 > 0.1 Counts
## 6 NA > 0.1 Counts
# Set ylim: has to adjusted by users depending on data
yl <- c(-25, 25)
# Set min log2 fold change of interest
mLog <- c(-1, 1)
# Initialize a list storing MA plots
MAList <- ddsList
# Create MA plots
for (i in shrinkage) {
both.df <- rbind(all.resList[[i]][[TvC[1]]],
all.resList[[i]][[TvC[2]]])
MAList[[i]] <- ggplot(both.df,
aes(x=baseMean, y=log2FoldChange, color=FDR)) +geom_point() + scale_x_log10() + facet_grid(~Input) +
theme_bw() +
scale_color_manual(values=c("blue", "grey")) +
ggtitle(paste("MA plot with", i)) +
ylim(yl[1], yl[2]) +
geom_hline(yintercept=c(mLog[1], mLog[2]), linetype="dashed", color="red")
}
# Print MA plots
MAList
## $TPM
## class: DESeqDataSet
## dim: 60217 6
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(60217): ENSG00000000003 ENSG00000000005 ... ENSG00000288683
## ENSG00000288696
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(4): Sample Group Path sizeFactor
##
## $Counts
## class: DESeqDataSet
## dim: 60217 6
## metadata(1): version
## assays(6): counts avgTxLength ... H cooks
## rownames(60217): ENSG00000000003 ENSG00000000005 ... ENSG00000288683
## ENSG00000288696
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
##
## $None
##
## $Normal
##
## $Apeglm
##
## $Ashr
# Subset rows with FDR < alpha
both.df <- rbind(all.resList[[1]][['TPM']],
all.resList[[1]][['Counts']])
both.df$Input <- factor(both.df$Input, levels=TvC)
# plot distribution of fdr
ggplot(both.df,
aes(x=padj, color=Input)) +
geom_density(size=1, aes(y=..count..)) +
theme_bw() +
ggtitle("distribution of false discovery rate (fdr)") +
xlab("adjusted p-value (fdr)") +
ylab("count") +
geom_vline(xintercept=alpha,
color="black",
linetype="dashed",
size=1) +
scale_x_continuous(breaks=seq(0, 1, by=0.1))
# Initialize a lfc list
lfcplotList <- all.resList
# Create lfc distribution plots
for (i in shrinkage) {
lfc.df <- rbind(all.resList[[i]][[TvC[1]]],
all.resList[[i]][[TvC[2]]])
lfc.df <- lfc.df[lfc.df$FDR == "< 0.1",]
lfc.df$Input <- factor(lfc.df$Input, levels=TvC)
lfcplotList[[i]] <- ggplot(lfc.df, # Subset rows with FDR < alpha
aes(x=log2FoldChange, color=Input)) +
geom_density(size=1, aes(y=..count..)) +
theme_bw() + ylab("Count") +
geom_vline(xintercept=c(mLog[1], mLog[2]),
color="black",
linetype="dashed",
size=1) +
ggtitle(paste("Distribution of Log2FoldChange by Input Type:", i)) +
xlim(-5, 5)
}
# Print the lfc plots
lfcplotList
## $None
##
## $Normal
##
## $Apeglm
##
## $Ashr
# Count number of NA genes
type=c("Zero Counts", "Outliers", "Total NA Genes")
# Create a data frame storing NA gene number
NAstat <- both.df %>%
group_by(Input) %>%
summarize(zero=sum(is.na(log2FoldChange)),
outlier=sum(is.na(pvalue) & is.na(padj))) %>%
mutate(total=zero + outlier) %>%
gather(Type, Number, -Input) %>%
mutate(Type=factor(case_when(Type == "zero" ~ type[1],
Type == "outlier" ~ type[2],
Type == "total" ~ type[3]),
levels=type))
# Plot number of NA genes
ggplot(NAstat,
aes(x=Type, y=Number, group=Input, fill=Input, label=Number)) +
geom_bar(stat="identity", position="dodge") +
theme_bw() +
geom_text(position=position_dodge(width=1), vjust=1.5) +
ggtitle("Number of NA Genes") +
ylab("Number of Genes")
# Create a data frame storing the number of transcripts by gene id
AnnoDb.ntrans <- AnnoDb %>%
group_by(GENEID) %>%
summarize(num.trans=n_distinct(TXID))
# Create an empty list storing significant gene lfc tables
sigList <- list()
# Filter significant genes' lfc and save in the list
for (x in shrinkage) {
for (y in TvC) {
# Subset genes with FDR below alpha
sigList[[x]][[y]] <- subset(all.resList[[x]][[y]],
FDR == paste("<", alpha))
# Explore the output
print(head(sigList[[x]][[y]]))
print(dim(sigList[[x]][[y]]))
}
}
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 9 ENSG00000001084 1579.91666 0.5260867 0.1525682 3.448207 5.643208e-04
## 14 ENSG00000001561 2231.51812 0.6038548 0.1471349 4.104090 4.059087e-05
## 18 ENSG00000001630 1971.01636 -0.4025373 0.1207053 -3.334877 8.533709e-04
## 30 ENSG00000002834 1072.81969 -0.6600582 0.1426174 -4.628176 3.689008e-06
## 38 ENSG00000003393 2974.86603 -0.3422075 0.1210011 -2.828135 4.682005e-03
## 40 ENSG00000003402 96.37838 -1.8291764 0.4290495 -4.263323 2.014093e-05
## padj FDR Input
## 9 6.833483e-03 < 0.1 TPM
## 14 7.692425e-04 < 0.1 TPM
## 18 9.584329e-03 < 0.1 TPM
## 30 9.655758e-05 < 0.1 TPM
## 38 3.681661e-02 < 0.1 TPM
## 40 4.164696e-04 < 0.1 TPM
## [1] 3342 9
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 9 ENSG00000001084 1599.77911 0.5255233 0.1526996 3.441550 5.783914e-04
## 14 ENSG00000001561 2264.37496 0.6032209 0.1470826 4.101239 4.109442e-05
## 18 ENSG00000001630 1999.73142 -0.4028765 0.1217931 -3.307877 9.400612e-04
## 30 ENSG00000002834 1088.29573 -0.6611145 0.1430768 -4.620696 3.824545e-06
## 38 ENSG00000003393 3018.81754 -0.3427407 0.1220640 -2.807878 4.986904e-03
## 40 ENSG00000003402 89.65349 -1.8179235 0.4271838 -4.255600 2.084889e-05
## padj FDR Input
## 9 7.030493e-03 < 0.1 Counts
## 14 7.727078e-04 < 0.1 Counts
## 18 1.036219e-02 < 0.1 Counts
## 30 9.925706e-05 < 0.1 Counts
## 38 3.894634e-02 < 0.1 Counts
## 40 4.273907e-04 < 0.1 Counts
## [1] 3349 9
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 9 ENSG00000001084 1579.91666 0.5073449 0.1471299 3.448207 5.643208e-04
## 14 ENSG00000001561 2231.51812 0.5838044 0.1422449 4.104090 4.059087e-05
## 18 ENSG00000001630 1971.01636 -0.3934405 0.1179763 -3.334877 8.533709e-04
## 30 ENSG00000002834 1072.81969 -0.6394221 0.1381508 -4.628176 3.689008e-06
## 38 ENSG00000003393 2974.86603 -0.3344360 0.1182530 -2.828135 4.682005e-03
## 40 ENSG00000003402 96.37838 -1.4142977 0.3311581 -4.263323 2.014093e-05
## padj FDR Input
## 9 6.833483e-03 < 0.1 TPM
## 14 7.692425e-04 < 0.1 TPM
## 18 9.584329e-03 < 0.1 TPM
## 30 9.655758e-05 < 0.1 TPM
## 38 3.681661e-02 < 0.1 TPM
## 40 4.164696e-04 < 0.1 TPM
## [1] 3342 9
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 9 ENSG00000001084 1599.77911 0.5066282 0.1472061 3.441550 5.783914e-04
## 14 ENSG00000001561 2264.37496 0.5830545 0.1421609 4.101239 4.109442e-05
## 18 ENSG00000001630 1999.73142 -0.3935402 0.1189695 -3.307877 9.400612e-04
## 30 ENSG00000002834 1088.29573 -0.6401589 0.1385337 -4.620696 3.824545e-06
## 38 ENSG00000003393 3018.81754 -0.3347627 0.1192225 -2.807878 4.986904e-03
## 40 ENSG00000003402 89.65349 -1.4048001 0.3303655 -4.255600 2.084889e-05
## padj FDR Input
## 9 7.030493e-03 < 0.1 Counts
## 14 7.727078e-04 < 0.1 Counts
## 18 1.036219e-02 < 0.1 Counts
## 30 9.925706e-05 < 0.1 Counts
## 38 3.894634e-02 < 0.1 Counts
## 40 4.273907e-04 < 0.1 Counts
## [1] 3349 9
## Gene baseMean log2FoldChange lfcSE pvalue
## 9 ENSG00000001084 1579.91666 -0.4726981 0.1532595 5.643208e-04
## 14 ENSG00000001561 2231.51812 -0.5555317 0.1486431 4.059087e-05
## 18 ENSG00000001630 1971.01636 0.3698951 0.1198304 8.533709e-04
## 30 ENSG00000002834 1072.81969 0.6164075 0.1443250 3.689008e-06
## 38 ENSG00000003393 2974.86603 0.3107452 0.1190360 4.682005e-03
## 40 ENSG00000003402 96.37838 1.6180973 0.4563044 2.014093e-05
## padj FDR Input
## 9 6.833483e-03 < 0.1 TPM
## 14 7.692425e-04 < 0.1 TPM
## 18 9.584329e-03 < 0.1 TPM
## 30 9.655758e-05 < 0.1 TPM
## 38 3.681661e-02 < 0.1 TPM
## 40 4.164696e-04 < 0.1 TPM
## [1] 3342 8
## Gene baseMean log2FoldChange lfcSE pvalue
## 9 ENSG00000001084 1599.77911 -0.4720725 0.1533639 5.783914e-04
## 14 ENSG00000001561 2264.37496 -0.5550610 0.1485718 4.109442e-05
## 18 ENSG00000001630 1999.73142 0.3696152 0.1208830 9.400612e-04
## 30 ENSG00000002834 1088.29573 0.6172558 0.1447949 3.824545e-06
## 38 ENSG00000003393 3018.81754 0.3107198 0.1200109 4.986904e-03
## 40 ENSG00000003402 89.65349 1.6067050 0.4552665 2.084889e-05
## padj FDR Input
## 9 7.030493e-03 < 0.1 Counts
## 14 7.727078e-04 < 0.1 Counts
## 18 1.036219e-02 < 0.1 Counts
## 30 9.925706e-05 < 0.1 Counts
## 38 3.894634e-02 < 0.1 Counts
## 40 4.273907e-04 < 0.1 Counts
## [1] 3349 8
## Gene baseMean log2FoldChange lfcSE pvalue
## 9 ENSG00000001084 1579.91666 -0.4256113 0.1526027 5.643208e-04
## 14 ENSG00000001561 2231.51812 -0.5122028 0.1506099 4.059087e-05
## 18 ENSG00000001630 1971.01636 0.3387335 0.1181537 8.533709e-04
## 30 ENSG00000002834 1072.81969 0.5771369 0.1469019 3.689008e-06
## 38 ENSG00000003393 2974.86603 0.2820018 0.1156657 4.682005e-03
## 40 ENSG00000003402 96.37838 1.4139227 0.5036537 2.014093e-05
## padj FDR Input
## 9 6.833483e-03 < 0.1 TPM
## 14 7.692425e-04 < 0.1 TPM
## 18 9.584329e-03 < 0.1 TPM
## 30 9.655758e-05 < 0.1 TPM
## 38 3.681661e-02 < 0.1 TPM
## 40 4.164696e-04 < 0.1 TPM
## [1] 3342 8
## Gene baseMean log2FoldChange lfcSE pvalue
## 9 ENSG00000001084 1599.77911 -0.4251008 0.1526704 5.783914e-04
## 14 ENSG00000001561 2264.37496 -0.5117762 0.1505215 4.109442e-05
## 18 ENSG00000001630 1999.73142 0.3382107 0.1191277 9.400612e-04
## 30 ENSG00000002834 1088.29573 0.5778473 0.1473885 3.824545e-06
## 38 ENSG00000003393 3018.81754 0.2817658 0.1165975 4.986904e-03
## 40 ENSG00000003402 89.65349 1.3993226 0.5011852 2.084889e-05
## padj FDR Input
## 9 7.030493e-03 < 0.1 Counts
## 14 7.727078e-04 < 0.1 Counts
## 18 1.036219e-02 < 0.1 Counts
## 30 9.925706e-05 < 0.1 Counts
## 38 3.894634e-02 < 0.1 Counts
## 40 4.273907e-04 < 0.1 Counts
## [1] 3349 8
# Clean the data frames with renaming columns
for (x in shrinkage) {
# Join TPM and Counts tables by GENEID
df <- inner_join(sigList[[x]][[1]],
sigList[[x]][[2]],
by="Gene")
# Create a vector storing original column nams
my.colname1 <- colnames(sigList[[x]][[1]])[-1]
# Create a vector storing modified column names
my.colname2 <- c("GENEID",
paste0(my.colname1, "_", TvC[1]),
paste0(my.colname1, "_", TvC[2]))
# Rename the columns
colnames(df) <- my.colname2
# Add a column storing shrinkage method and drop redundant columns
df <- df %>%
mutate(Shrinkage = x) %>%
dplyr::select(-starts_with(c("lfcSE", "stat", "FDR")))
# Save the cleaned data frame in the list
sigList[[x]] <- df
# Explore the output data frame
print(head(df))
print(dim(df))
}
## GENEID baseMean_TPM log2FoldChange_TPM pvalue_TPM padj_TPM
## 1 ENSG00000001084 1579.91666 0.5260867 5.643208e-04 6.833483e-03
## 2 ENSG00000001561 2231.51812 0.6038548 4.059087e-05 7.692425e-04
## 3 ENSG00000001630 1971.01636 -0.4025373 8.533709e-04 9.584329e-03
## 4 ENSG00000002834 1072.81969 -0.6600582 3.689008e-06 9.655758e-05
## 5 ENSG00000003393 2974.86603 -0.3422075 4.682005e-03 3.681661e-02
## 6 ENSG00000003402 96.37838 -1.8291764 2.014093e-05 4.164696e-04
## Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts padj_Counts
## 1 TPM 1599.77911 0.5255233 5.783914e-04 7.030493e-03
## 2 TPM 2264.37496 0.6032209 4.109442e-05 7.727078e-04
## 3 TPM 1999.73142 -0.4028765 9.400612e-04 1.036219e-02
## 4 TPM 1088.29573 -0.6611145 3.824545e-06 9.925706e-05
## 5 TPM 3018.81754 -0.3427407 4.986904e-03 3.894634e-02
## 6 TPM 89.65349 -1.8179235 2.084889e-05 4.273907e-04
## Input_Counts Shrinkage
## 1 Counts None
## 2 Counts None
## 3 Counts None
## 4 Counts None
## 5 Counts None
## 6 Counts None
## [1] 3312 12
## GENEID baseMean_TPM log2FoldChange_TPM pvalue_TPM padj_TPM
## 1 ENSG00000001084 1579.91666 0.5073449 5.643208e-04 6.833483e-03
## 2 ENSG00000001561 2231.51812 0.5838044 4.059087e-05 7.692425e-04
## 3 ENSG00000001630 1971.01636 -0.3934405 8.533709e-04 9.584329e-03
## 4 ENSG00000002834 1072.81969 -0.6394221 3.689008e-06 9.655758e-05
## 5 ENSG00000003393 2974.86603 -0.3344360 4.682005e-03 3.681661e-02
## 6 ENSG00000003402 96.37838 -1.4142977 2.014093e-05 4.164696e-04
## Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts padj_Counts
## 1 TPM 1599.77911 0.5066282 5.783914e-04 7.030493e-03
## 2 TPM 2264.37496 0.5830545 4.109442e-05 7.727078e-04
## 3 TPM 1999.73142 -0.3935402 9.400612e-04 1.036219e-02
## 4 TPM 1088.29573 -0.6401589 3.824545e-06 9.925706e-05
## 5 TPM 3018.81754 -0.3347627 4.986904e-03 3.894634e-02
## 6 TPM 89.65349 -1.4048001 2.084889e-05 4.273907e-04
## Input_Counts Shrinkage
## 1 Counts Normal
## 2 Counts Normal
## 3 Counts Normal
## 4 Counts Normal
## 5 Counts Normal
## 6 Counts Normal
## [1] 3312 12
## GENEID baseMean_TPM log2FoldChange_TPM pvalue_TPM padj_TPM
## 1 ENSG00000001084 1579.91666 -0.4726981 5.643208e-04 6.833483e-03
## 2 ENSG00000001561 2231.51812 -0.5555317 4.059087e-05 7.692425e-04
## 3 ENSG00000001630 1971.01636 0.3698951 8.533709e-04 9.584329e-03
## 4 ENSG00000002834 1072.81969 0.6164075 3.689008e-06 9.655758e-05
## 5 ENSG00000003393 2974.86603 0.3107452 4.682005e-03 3.681661e-02
## 6 ENSG00000003402 96.37838 1.6180973 2.014093e-05 4.164696e-04
## Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts padj_Counts
## 1 TPM 1599.77911 -0.4720725 5.783914e-04 7.030493e-03
## 2 TPM 2264.37496 -0.5550610 4.109442e-05 7.727078e-04
## 3 TPM 1999.73142 0.3696152 9.400612e-04 1.036219e-02
## 4 TPM 1088.29573 0.6172558 3.824545e-06 9.925706e-05
## 5 TPM 3018.81754 0.3107198 4.986904e-03 3.894634e-02
## 6 TPM 89.65349 1.6067050 2.084889e-05 4.273907e-04
## Input_Counts Shrinkage
## 1 Counts Apeglm
## 2 Counts Apeglm
## 3 Counts Apeglm
## 4 Counts Apeglm
## 5 Counts Apeglm
## 6 Counts Apeglm
## [1] 3312 12
## GENEID baseMean_TPM log2FoldChange_TPM pvalue_TPM padj_TPM
## 1 ENSG00000001084 1579.91666 -0.4256113 5.643208e-04 6.833483e-03
## 2 ENSG00000001561 2231.51812 -0.5122028 4.059087e-05 7.692425e-04
## 3 ENSG00000001630 1971.01636 0.3387335 8.533709e-04 9.584329e-03
## 4 ENSG00000002834 1072.81969 0.5771369 3.689008e-06 9.655758e-05
## 5 ENSG00000003393 2974.86603 0.2820018 4.682005e-03 3.681661e-02
## 6 ENSG00000003402 96.37838 1.4139227 2.014093e-05 4.164696e-04
## Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts padj_Counts
## 1 TPM 1599.77911 -0.4251008 5.783914e-04 7.030493e-03
## 2 TPM 2264.37496 -0.5117762 4.109442e-05 7.727078e-04
## 3 TPM 1999.73142 0.3382107 9.400612e-04 1.036219e-02
## 4 TPM 1088.29573 0.5778473 3.824545e-06 9.925706e-05
## 5 TPM 3018.81754 0.2817658 4.986904e-03 3.894634e-02
## 6 TPM 89.65349 1.3993226 2.084889e-05 4.273907e-04
## Input_Counts Shrinkage
## 1 Counts Ashr
## 2 Counts Ashr
## 3 Counts Ashr
## 4 Counts Ashr
## 5 Counts Ashr
## 6 Counts Ashr
## [1] 3312 12
# Convert a list of data frames to one single data frame
lfcTable <- sigList[[1]]
for (i in 2:length(shrinkage)) {
lfcTable <- rbind(lfcTable, sigList[[i]])
}
# Explore the output data frame
head(lfcTable)
## GENEID baseMean_TPM log2FoldChange_TPM pvalue_TPM padj_TPM
## 1 ENSG00000001084 1579.91666 0.5260867 5.643208e-04 6.833483e-03
## 2 ENSG00000001561 2231.51812 0.6038548 4.059087e-05 7.692425e-04
## 3 ENSG00000001630 1971.01636 -0.4025373 8.533709e-04 9.584329e-03
## 4 ENSG00000002834 1072.81969 -0.6600582 3.689008e-06 9.655758e-05
## 5 ENSG00000003393 2974.86603 -0.3422075 4.682005e-03 3.681661e-02
## 6 ENSG00000003402 96.37838 -1.8291764 2.014093e-05 4.164696e-04
## Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts padj_Counts
## 1 TPM 1599.77911 0.5255233 5.783914e-04 7.030493e-03
## 2 TPM 2264.37496 0.6032209 4.109442e-05 7.727078e-04
## 3 TPM 1999.73142 -0.4028765 9.400612e-04 1.036219e-02
## 4 TPM 1088.29573 -0.6611145 3.824545e-06 9.925706e-05
## 5 TPM 3018.81754 -0.3427407 4.986904e-03 3.894634e-02
## 6 TPM 89.65349 -1.8179235 2.084889e-05 4.273907e-04
## Input_Counts Shrinkage
## 1 Counts None
## 2 Counts None
## 3 Counts None
## 4 Counts None
## 5 Counts None
## 6 Counts None
dim(lfcTable)
## [1] 13248 12
# Calculate differences between TPM and Counts input data
# in baseMean, log2FoldChange, and padj
lfcTable <- lfcTable %>%
mutate(mean_TC=baseMean_TPM - baseMean_Counts,
lfc_TC=log2FoldChange_TPM - log2FoldChange_Counts,
FDR_TC=padj_TPM - padj_Counts) %>%
# Add a column storing the number of alternative transcripts
left_join(AnnoDb.ntrans, by="GENEID")
# Explore the output data frame
head(lfcTable)
## GENEID baseMean_TPM log2FoldChange_TPM pvalue_TPM padj_TPM
## 1 ENSG00000001084 1579.91666 0.5260867 5.643208e-04 6.833483e-03
## 2 ENSG00000001561 2231.51812 0.6038548 4.059087e-05 7.692425e-04
## 3 ENSG00000001630 1971.01636 -0.4025373 8.533709e-04 9.584329e-03
## 4 ENSG00000002834 1072.81969 -0.6600582 3.689008e-06 9.655758e-05
## 5 ENSG00000003393 2974.86603 -0.3422075 4.682005e-03 3.681661e-02
## 6 ENSG00000003402 96.37838 -1.8291764 2.014093e-05 4.164696e-04
## Input_TPM baseMean_Counts log2FoldChange_Counts pvalue_Counts padj_Counts
## 1 TPM 1599.77911 0.5255233 5.783914e-04 7.030493e-03
## 2 TPM 2264.37496 0.6032209 4.109442e-05 7.727078e-04
## 3 TPM 1999.73142 -0.4028765 9.400612e-04 1.036219e-02
## 4 TPM 1088.29573 -0.6611145 3.824545e-06 9.925706e-05
## 5 TPM 3018.81754 -0.3427407 4.986904e-03 3.894634e-02
## 6 TPM 89.65349 -1.8179235 2.084889e-05 4.273907e-04
## Input_Counts Shrinkage mean_TC lfc_TC FDR_TC num.trans
## 1 Counts None -19.862453 0.0005634285 -1.970095e-04 14
## 2 Counts None -32.856840 0.0006339146 -3.465262e-06 1
## 3 Counts None -28.715059 0.0003391573 -7.778599e-04 4
## 4 Counts None -15.476039 0.0010562739 -2.699484e-06 9
## 5 Counts None -43.951509 0.0005332544 -2.129730e-03 75
## 6 Counts None 6.724896 -0.0112529139 -1.092112e-05 25
dim(lfcTable)
## [1] 13248 16
# Set a function to create a vector storing plot labels
plotlabels.fn <- function(myvec, mylist, num) {
vec <- c()
for (i in 1:num) {
vec <- c(vec, c(myvec[i], rep("", nrow(mylist[[i]]) - 1)))
}
return(vec)
}
my.param <- c("baseMean", "log2FoldChange", "padj")
# Slice and clean the data frame for input
lfcTable.comp <- lfcTable %>%
dplyr::select(GENEID, num.trans, Shrinkage, starts_with(my.param)) %>%
gather(Category, Value, -GENEID, -num.trans, -Shrinkage) %>%
separate(Category, c("Metric", "Input"), sep="_") %>%
pivot_wider(names_from=Input, values_from=Value) %>%
nest(-Metric, -Shrinkage)
# Create a vector storing Rsquared values between TPM and Counts outputs
corr.vec <- round(map_dbl(lfcTable.comp$data,
~cor(.x$TPM, .x$Counts)),
7)
# Create a ggplot labeling vector converted from the Rsquared vector
rsq.vec <- plotlabels.fn(corr.vec,
lfcTable.comp$data,
length(corr.vec))
# Unnest the data frame
lfcTable.comp <- lfcTable.comp %>%
unnest(data)
# Add a column storing Rsquared labels
lfcTable.comp$Rsquared.label <- rsq.vec
# Explore the cleaned data frame
head(lfcTable.comp)
## # A tibble: 6 x 7
## Shrinkage Metric GENEID num.trans TPM Counts Rsquared.label
## <chr> <chr> <chr> <int> <dbl> <dbl> <chr>
## 1 None baseMean ENSG00000001084 14 1580. 1600. "0.9999992"
## 2 None baseMean ENSG00000001561 1 2232. 2264. ""
## 3 None baseMean ENSG00000001630 4 1971. 2000. ""
## 4 None baseMean ENSG00000002834 9 1073. 1088. ""
## 5 None baseMean ENSG00000003393 75 2975. 3019. ""
## 6 None baseMean ENSG00000003402 25 96.4 89.7 ""
# Nest the data frame by metric
lfcTable.comp <- lfcTable.comp %>%
nest(-Metric)
# Set a function creating a scatter plot
comp.scatter.fn <- function(df,
xvar,
yvar,
met,
xlabel,
ylabel) {
p <- ggplot(df, aes(x=xvar,
y=yvar,
color=log(num.trans),
label=Rsquared.label)) +
geom_point(alpha=0.5) +
theme_bw() +
ggtitle(paste("Comparison in", met, "\n(with R-Squared)")) +
geom_text(size=5,
mapping=aes(x=Inf, y=Inf),
vjust=2, hjust=3.8, color="black") +
geom_abline(slope=1, size=0.5, linetype="dashed", color="black") +
scale_color_gradient(low="blue", high="red") +
facet_wrap(~Shrinkage, scales="free") +
theme(strip.text.x=element_text(size=10)) +
xlab(paste("Input from", xlabel)) +
ylab(paste("Input from", ylabel))
return(p)
}
# Print the plots
for (i in 1:nrow(lfcTable.comp)) {
df <- lfcTable.comp$data[[i]]
print(comp.scatter.fn(df,
df$TPM,
df$Counts,
lfcTable.comp$Metric[i], "TPM", "Counts"))
}
# Transform the data frame
lfcTable.rank <- lfcTable.comp %>%
unnest(data) %>%
nest(-Metric, -Shrinkage)
# Clean and arrange the data frame
for (i in 1:length(lfcTable.rank$data)) {
df <- lfcTable.rank$data[[i]]
# Create a vector storing rank (1 to the last)
rank.vec <- 1:nrow(df)
# Make descending order if the metric = "padj" and assigne rankings
if (lfcTable.rank$Metric[[i]] == "padj") {
df <- df %>%
dplyr::arrange(TPM) %>%
mutate(rank.TPM=rank.vec) %>%
dplyr::arrange(Counts) %>%
mutate(rank.Counts=rank.vec)
# Make asending order otherwise and assign rankings
} else {
df <- df %>%
dplyr::arrange(desc(abs(TPM))) %>%
mutate(rank.TPM=rank.vec) %>%
dplyr::arrange(desc(abs(Counts))) %>%
mutate(rank.Counts=rank.vec)
}
# Save the updated data frame
lfcTable.rank$data[[i]] <- df
}
# Create a vector storing Rsquared values between TPM and Counts outputs
corr.vec <- round(map_dbl(lfcTable.rank$data,
~cor(.x$rank.TPM, .x$rank.Counts)),
7)
# Create a ggplot labeling vector converted from the Rsquared vector
rsq.vec <- plotlabels.fn(corr.vec, lfcTable.rank$data, length(corr.vec))
# Unnest the data frame
lfcTable.rank <- lfcTable.rank %>%
unnest(data)
# Add a column storing Rsquared labels
lfcTable.rank$Rsquared.label <- rsq.vec
# Explore the cleaned data frame
head(lfcTable.rank)
## # A tibble: 6 x 9
## Metric Shrinkage GENEID num.trans TPM Counts Rsquared.label rank.TPM
## <chr> <chr> <chr> <int> <dbl> <dbl> <chr> <int>
## 1 baseMe… None ENSG000001… 7 1.40e6 1.42e6 "0.999857" 1
## 2 baseMe… None ENSG000002… 1 4.29e5 4.35e5 "" 2
## 3 baseMe… None ENSG000001… 1 2.89e5 2.93e5 "" 3
## 4 baseMe… None ENSG000001… 23 2.71e5 2.75e5 "" 4
## 5 baseMe… None ENSG000002… 4 2.02e5 2.05e5 "" 5
## 6 baseMe… None ENSG000001… 10 1.09e5 1.10e5 "" 6
## # … with 1 more variable: rank.Counts <int>
# Nest the data frame by metric
lfcTable.rank <- lfcTable.rank %>%
nest(-Metric)
# Print the plots
for (i in 1:nrow(lfcTable.rank)) {
df <- lfcTable.rank$data[[i]]
pl <- comp.scatter.fn(df,
df$rank.TPM,
df$rank.Counts,
lfcTable.rank$Metric[i],
"TPM", "Counts") +
ggtitle(paste("Comparison in", lfcTable.rank$Metric[i], "Rank\n(with R-Squared)"))
print(pl)
}
# Set a function cleaning data to generate upset plots
upset.input.fn <- function(df) {
# Filter genes with valid padj
df <- subset(df, !is.na(padj)) %>%
mutate(Up=ifelse(FDR == FDRv[1] & log2FoldChange > 0, Gene, ""), # What are upregulated genes?
Down=ifelse(FDR == FDRv[1] & log2FoldChange < 0, Gene, ""), # What are downregulated genes?
Unchanged=ifelse(FDR == FDRv[2], Gene, ""), # What are unchanged genes?
TPM_Input=ifelse(Input == "TPM", Gene, ""), # What are the genes from TPM input?
Counts_Input=ifelse(Input == "Counts", Gene, "")) # What are the genes from Counts input?
# Create a list storing groups of interest
upsetInput <- list(Up=df$Up,
Down=df$Down,
Unchanged=df$Unchanged,
TPM_Input=df$TPM,
Counts_Input=df$Counts)
return(upsetInput)
}
upsetList <- upset.input.fn(both.df)
# Create the upset plot
upset(fromList(upsetList),
sets=names(upsetList), # What group to display
sets.x.label="Number of Genes per Group",
order.by="freq",
point.size=3,
sets.bar.color=c("red", "red","blue", "blue", "blue"),
text.scale = 1.5, number.angles=30)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.2 LTS
##
## Matrix products: default
## BLAS: /home/mira/miniconda3/envs/r/lib/libblas.so.3.8.0
## LAPACK: /home/mira/miniconda3/envs/r/lib/liblapack.so.3.8.0
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] ashr_2.2-47 apeglm_1.12.0
## [3] ensembldb_2.14.0 AnnotationFilter_1.14.0
## [5] GenomicFeatures_1.42.1 AnnotationDbi_1.52.0
## [7] UpSetR_1.4.0 gridExtra_2.3
## [9] pheatmap_1.0.12 DESeq2_1.30.1
## [11] SummarizedExperiment_1.20.0 Biobase_2.50.0
## [13] MatrixGenerics_1.2.0 matrixStats_0.58.0
## [15] GenomicRanges_1.42.0 GenomeInfoDb_1.26.0
## [17] IRanges_2.24.1 S4Vectors_0.28.1
## [19] tximport_1.18.0 forcats_0.5.1
## [21] stringr_1.4.0 dplyr_1.0.5
## [23] purrr_0.3.4 readr_1.4.0
## [25] tidyr_1.1.3 tibble_3.1.0
## [27] ggplot2_3.3.3 tidyverse_1.3.0
## [29] AnnotationHub_2.22.0 BiocFileCache_1.14.0
## [31] dbplyr_2.1.0 BiocGenerics_0.36.0
## [33] rmarkdown_2.7 data.table_1.14.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.2.1
## [3] plyr_1.8.6 lazyeval_0.2.2
## [5] splines_4.0.3 BiocParallel_1.24.0
## [7] digest_0.6.27 invgamma_1.1
## [9] htmltools_0.5.1.1 SQUAREM_2021.1
## [11] fansi_0.4.2 magrittr_2.0.1
## [13] memoise_2.0.0 Biostrings_2.58.0
## [15] annotate_1.68.0 modelr_0.1.8
## [17] askpass_1.1 bdsmatrix_1.3-4
## [19] prettyunits_1.1.1 colorspace_2.0-0
## [21] blob_1.2.1 rvest_0.3.6
## [23] rappdirs_0.3.3 haven_2.3.1
## [25] xfun_0.21 crayon_1.4.1
## [27] RCurl_1.98-1.2 jsonlite_1.7.2
## [29] genefilter_1.72.1 survival_3.2-7
## [31] glue_1.4.2 gtable_0.3.0
## [33] zlibbioc_1.36.0 XVector_0.30.0
## [35] DelayedArray_0.16.0 scales_1.1.1
## [37] mvtnorm_1.1-1 DBI_1.1.1
## [39] Rcpp_1.0.6 xtable_1.8-4
## [41] progress_1.2.2 emdbook_1.3.12
## [43] bit_4.0.4 truncnorm_1.0-8
## [45] httr_1.4.2 RColorBrewer_1.1-2
## [47] ellipsis_0.3.1 farver_2.0.3
## [49] pkgconfig_2.0.3 XML_3.99-0.5
## [51] sass_0.3.1 locfit_1.5-9.4
## [53] utf8_1.1.4 labeling_0.4.2
## [55] tidyselect_1.1.0 rlang_0.4.10
## [57] later_1.1.0.1 munsell_0.5.0
## [59] BiocVersion_3.12.0 cellranger_1.1.0
## [61] tools_4.0.3 cachem_1.0.4
## [63] cli_2.3.1 generics_0.1.0
## [65] RSQLite_2.2.3 broom_0.7.5
## [67] evaluate_0.14 fastmap_1.1.0
## [69] yaml_2.2.1 knitr_1.31
## [71] bit64_4.0.5 fs_1.5.0
## [73] mime_0.10 xml2_1.3.2
## [75] biomaRt_2.46.3 compiler_4.0.3
## [77] rstudioapi_0.13 curl_4.3
## [79] interactiveDisplayBase_1.28.0 reprex_1.0.0
## [81] geneplotter_1.68.0 bslib_0.2.4
## [83] stringi_1.5.3 highr_0.8
## [85] ps_1.5.0 lattice_0.20-41
## [87] ProtGenerics_1.22.0 Matrix_1.3-2
## [89] vctrs_0.3.6 pillar_1.5.0
## [91] lifecycle_1.0.0 BiocManager_1.30.10
## [93] jquerylib_0.1.3 irlba_2.3.3
## [95] bitops_1.0-6 httpuv_1.5.5
## [97] rtracklayer_1.50.0 R6_2.5.0
## [99] promises_1.2.0.1 MASS_7.3-53.1
## [101] assertthat_0.2.1 openssl_1.4.3
## [103] withr_2.4.1 GenomicAlignments_1.26.0
## [105] Rsamtools_2.6.0 GenomeInfoDbData_1.2.4
## [107] hms_1.0.0 grid_4.0.3
## [109] coda_0.19-4 mixsqp_0.3-43
## [111] bbmle_1.0.23.1 numDeriv_2016.8-1.1
## [113] shiny_1.6.0 lubridate_1.7.10